Clean data
# Convert the categorical columns to factors in a single pass.
factor_cols <- c(
  "gender", "marital_status", "category", "class",
  "survived", "embarked", "disembarked"
)
dat[factor_cols] <- lapply(dat[factor_cols], as.factor)
# # PEER Review MV: You could consider using dplyr to recode these variables, and also create factors with a more meaningful level order for the variables that have multiple levels — for example, ordering the levels of marital status by frequency. This probably isn't very helpful, but it is a little cleaner than the code above.
# dat <- dat %>%
# mutate(gender = as.factor(gender),
# marital_status = fct_infreq(marital_status),
# category = as.factor(category),
# class = as.factor(class),
# survived = as.factor(survived),
# embarked = as.factor(embarked),
# disembarked = as.factor(disembarked))
# Build `nationality2`, a cleaned-up copy of `nationality`:
#   * values in the keep-list below are carried over unchanged,
#   * "Sweden" is folded into "Swedish",
#   * "Syrian,Lebanese" is relabelled "Syrian/Lebanese",
#   * everything else (any value not listed, including missing values)
#     becomes "Other - Multiple".
dat <- dat %>%
  mutate(nationality2 = case_when(
    nationality %in% c(
      "English", "Irish", "American", "Swedish", "Finnish", "Scottish",
      "French", "Italian", "Canadian", "Bulgarian", "Croatian", "Belgian",
      "Norwegian", "Channel Islander", "Welsh", "Swiss", "German", "Danish",
      "Spanish", "Australian", "Polish", "South African", "Bosnian",
      "Hong Kongese", "Dutch", "Lithuanian", "Greek", "Portuguese",
      "Uruguayan", "Chinese", "Slovenian", "Cape Verdean", "Egyptian",
      "Japanese", "Hungarian", "Latvian", "Austrian", "Mexican", "Turkish",
      "Guyanese", "Haitian", "Unknown"
    ) ~ as.character(nationality),
    nationality == "Sweden" ~ "Swedish",
    nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
    TRUE ~ "Other - Multiple"
  ))
## PEER Review MV: For Nationality, Consider creating a more collapsed factor variable that only has those nationalities with 10 or more individuals and then an other category. Then you could potentially create a cleaner bar graph. Added some code below
# dat <- dat %>%
# mutate(nationality_cat = fct_lump_min(nationality,10),
# nationality_cat = fct_infreq(nationality_cat))
# Treat the "Unknown" label as a missing value. na_if() keeps the column's
# type even when every value is "Unknown", which ifelse(..., NA, ...) does not.
dat <- dat %>%
  mutate(nationality2 = na_if(nationality2, "Unknown"))
Descriptives
# Breakdown of passengers by class: count and share of all rows per class,
# with a totals row appended by adorn_totals().
dat %>%
  count(class, name = "count") %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  adorn_totals() %>%
  kable(
    caption = "Breakdown of Passengers by Class",
    col.names = c("Class", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  # Spell out FALSE: the shorthand `F` is an ordinary variable and can be
  # reassigned.
  kable_classic(full_width = FALSE, html_font = "Cambria")
Breakdown of Passengers by Class
|
Class
|
Count
|
Percent
|
|
1st Class
|
324
|
24.64
|
|
2nd Class
|
284
|
21.60
|
|
3rd Class
|
707
|
53.76
|
|
Total
|
1315
|
100.00
|
# Breakdown of passengers by class and gender. Percentages are computed
# within each class (the gender shares of a class sum to 100).
dat %>%
  group_by(class, gender) %>%
  # Keep the result grouped by class on purpose so that sum(count) below is
  # the per-class total; stating it explicitly also silences the
  # regrouping message summarize() would otherwise print.
  summarize(count = n(), .groups = "drop_last") %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  ungroup() %>%
  kable(
    caption = "Breakdown of Passengers by Class and Gender",
    col.names = c("Class", "Gender", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_classic(full_width = FALSE, html_font = "Cambria")
Breakdown of Passengers by Class and Gender
|
Class
|
Gender
|
Count
|
Percent
|
|
1st Class
|
Female
|
144
|
44.44
|
|
1st Class
|
Male
|
180
|
55.56
|
|
2nd Class
|
Female
|
106
|
37.32
|
|
2nd Class
|
Male
|
178
|
62.68
|
|
3rd Class
|
Female
|
216
|
30.55
|
|
3rd Class
|
Male
|
491
|
69.45
|
# Breakdown of passenger nationalities: count and share of all rows with a
# known nationality, most common first.
dat %>%
  filter(!is.na(nationality2)) %>%
  count(nationality2, name = "count") %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  arrange(desc(percent)) %>%
  kable(
    caption = "Breakdown of Passenger Nationalities",
    col.names = c("Nationality", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling(
    fixed_thead = TRUE,
    full_width = FALSE,
    html_font = "Cambria",
    bootstrap_options = c("striped", "hover")
  )
Breakdown of Passenger Nationalities
|
Nationality
|
Count
|
Percent
|
|
English
|
295
|
22.43
|
|
American
|
242
|
18.40
|
|
Irish
|
122
|
9.28
|
|
Other - Multiple
|
108
|
8.21
|
|
Swedish
|
100
|
7.60
|
|
Syrian/Lebanese
|
85
|
6.46
|
|
Finnish
|
58
|
4.41
|
|
Canadian
|
37
|
2.81
|
|
Bulgarian
|
31
|
2.36
|
|
Croatian
|
28
|
2.13
|
|
French
|
26
|
1.98
|
|
Norwegian
|
26
|
1.98
|
|
Belgian
|
25
|
1.90
|
|
Scottish
|
17
|
1.29
|
|
Channel Islander
|
15
|
1.14
|
|
Swiss
|
13
|
0.99
|
|
Danish
|
10
|
0.76
|
|
Italian
|
9
|
0.68
|
|
German
|
8
|
0.61
|
|
Spanish
|
8
|
0.61
|
|
Welsh
|
8
|
0.61
|
|
Polish
|
6
|
0.46
|
|
Bosnian
|
4
|
0.30
|
|
Hong Kongese
|
4
|
0.30
|
|
South African
|
4
|
0.30
|
|
Greek
|
3
|
0.23
|
|
Lithuanian
|
3
|
0.23
|
|
Uruguayan
|
3
|
0.23
|
|
Australian
|
2
|
0.15
|
|
Chinese
|
2
|
0.15
|
|
Portuguese
|
2
|
0.15
|
|
Slovenian
|
2
|
0.15
|
|
Austrian
|
1
|
0.08
|
|
Dutch
|
1
|
0.08
|
|
Egyptian
|
1
|
0.08
|
|
Haitian
|
1
|
0.08
|
|
Hungarian
|
1
|
0.08
|
|
Japanese
|
1
|
0.08
|
|
Latvian
|
1
|
0.08
|
|
Mexican
|
1
|
0.08
|
|
Turkish
|
1
|
0.08
|
# Breakdown of passenger nationalities by class
# # PEER Review MV: - Consider visualizing some of your data that is currently in tables into bar graphs. This one doesn't have all the info on the table, but easier to see which passengers were most represented
# dat %>%
# filter(!is.na(nationality2)) %>%
# group_by(nationality_cat) %>%
# summarize(count = n()) %>%
# mutate(percent = (count/sum(count))*100) %>%
# arrange((percent)) %>%
# ggplot(aes(y = nationality_cat)) +
# geom_col(aes(x = percent), fill = "dark red") +
# geom_text(aes(x = percent, label=round(percent,2)), hjust = -.2, size = 3) +
# theme_minimal() +
# labs(x = "Percentage of all passengers", y = "Nationality")
# Breakdown of passenger nationalities by class (all rows with a known
# nationality). Percentages are computed within each class.
dat %>%
  filter(!is.na(nationality2)) %>%
  group_by(class, nationality2) %>%
  # Stay grouped by class so that sum(count) below is the per-class total.
  summarize(count = n(), .groups = "drop_last") %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  ungroup() %>%
  arrange(class, desc(percent)) %>%
  kable(
    caption = "Breakdown of Passenger Nationalities by Class (All)",
    col.names = c("Class", "Nationality", "Count", "Percent"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling(
    fixed_thead = TRUE,
    full_width = FALSE,
    html_font = "Cambria",
    bootstrap_options = c("striped", "hover")
  )
Breakdown of Passenger Nationalities by Class (All)
|
Class
|
Nationality
|
Count
|
Percent
|
|
1st Class
|
American
|
195
|
60.19
|
|
1st Class
|
English
|
38
|
11.73
|
|
1st Class
|
Canadian
|
27
|
8.33
|
|
1st Class
|
Other - Multiple
|
14
|
4.32
|
|
1st Class
|
French
|
10
|
3.09
|
|
1st Class
|
Swiss
|
6
|
1.85
|
|
1st Class
|
German
|
5
|
1.54
|
|
1st Class
|
Irish
|
5
|
1.54
|
|
1st Class
|
Spanish
|
4
|
1.23
|
|
1st Class
|
Swedish
|
4
|
1.23
|
|
1st Class
|
Scottish
|
3
|
0.93
|
|
1st Class
|
Uruguayan
|
3
|
0.93
|
|
1st Class
|
Belgian
|
2
|
0.62
|
|
1st Class
|
Italian
|
2
|
0.62
|
|
1st Class
|
Channel Islander
|
1
|
0.31
|
|
1st Class
|
Dutch
|
1
|
0.31
|
|
1st Class
|
Egyptian
|
1
|
0.31
|
|
1st Class
|
Mexican
|
1
|
0.31
|
|
1st Class
|
Norwegian
|
1
|
0.31
|
|
1st Class
|
Polish
|
1
|
0.31
|
|
2nd Class
|
English
|
145
|
51.06
|
|
2nd Class
|
Other - Multiple
|
25
|
8.80
|
|
2nd Class
|
American
|
24
|
8.45
|
|
2nd Class
|
Channel Islander
|
12
|
4.23
|
|
2nd Class
|
Irish
|
12
|
4.23
|
|
2nd Class
|
French
|
11
|
3.87
|
|
2nd Class
|
Scottish
|
8
|
2.82
|
|
2nd Class
|
Finnish
|
6
|
2.11
|
|
2nd Class
|
Swedish
|
6
|
2.11
|
|
2nd Class
|
Canadian
|
5
|
1.76
|
|
2nd Class
|
South African
|
4
|
1.41
|
|
2nd Class
|
Spanish
|
4
|
1.41
|
|
2nd Class
|
Danish
|
3
|
1.06
|
|
2nd Class
|
Italian
|
3
|
1.06
|
|
2nd Class
|
Lithuanian
|
2
|
0.70
|
|
2nd Class
|
Swiss
|
2
|
0.70
|
|
2nd Class
|
Syrian/Lebanese
|
2
|
0.70
|
|
2nd Class
|
Welsh
|
2
|
0.70
|
|
2nd Class
|
Australian
|
1
|
0.35
|
|
2nd Class
|
Belgian
|
1
|
0.35
|
|
2nd Class
|
German
|
1
|
0.35
|
|
2nd Class
|
Haitian
|
1
|
0.35
|
|
2nd Class
|
Hungarian
|
1
|
0.35
|
|
2nd Class
|
Japanese
|
1
|
0.35
|
|
2nd Class
|
Norwegian
|
1
|
0.35
|
|
2nd Class
|
Portuguese
|
1
|
0.35
|
|
3rd Class
|
English
|
112
|
15.84
|
|
3rd Class
|
Irish
|
105
|
14.85
|
|
3rd Class
|
Swedish
|
90
|
12.73
|
|
3rd Class
|
Syrian/Lebanese
|
83
|
11.74
|
|
3rd Class
|
Other - Multiple
|
69
|
9.76
|
|
3rd Class
|
Finnish
|
52
|
7.36
|
|
3rd Class
|
Bulgarian
|
31
|
4.38
|
|
3rd Class
|
Croatian
|
28
|
3.96
|
|
3rd Class
|
Norwegian
|
24
|
3.39
|
|
3rd Class
|
American
|
23
|
3.25
|
|
3rd Class
|
Belgian
|
22
|
3.11
|
|
3rd Class
|
Danish
|
7
|
0.99
|
|
3rd Class
|
Scottish
|
6
|
0.85
|
|
3rd Class
|
Welsh
|
6
|
0.85
|
|
3rd Class
|
Canadian
|
5
|
0.71
|
|
3rd Class
|
French
|
5
|
0.71
|
|
3rd Class
|
Polish
|
5
|
0.71
|
|
3rd Class
|
Swiss
|
5
|
0.71
|
|
3rd Class
|
Bosnian
|
4
|
0.57
|
|
3rd Class
|
Hong Kongese
|
4
|
0.57
|
|
3rd Class
|
Italian
|
4
|
0.57
|
|
3rd Class
|
Greek
|
3
|
0.42
|
|
3rd Class
|
Channel Islander
|
2
|
0.28
|
|
3rd Class
|
Chinese
|
2
|
0.28
|
|
3rd Class
|
German
|
2
|
0.28
|
|
3rd Class
|
Slovenian
|
2
|
0.28
|
|
3rd Class
|
Australian
|
1
|
0.14
|
|
3rd Class
|
Austrian
|
1
|
0.14
|
|
3rd Class
|
Latvian
|
1
|
0.14
|
|
3rd Class
|
Lithuanian
|
1
|
0.14
|
|
3rd Class
|
Portuguese
|
1
|
0.14
|
|
3rd Class
|
Turkish
|
1
|
0.14
|
# Nationality-by-class table in wide form: one row per nationality, one
# column per class, each cell holding that nationality's share (percent) of
# the passengers in that class.
#
# The earlier version pivoted the `class` column onto itself
# (values_from = "class"), which filled the cells with the class label or NA
# and left one row per class/nationality pair instead of one per nationality.
dat_class_tidy <- dat %>%
  filter(category == "Passenger") %>%
  filter(!is.na(nationality2)) %>%
  group_by(class, nationality2) %>%
  # Stay grouped by class so that sum(count) below is the per-class total.
  summarize(count = n(), .groups = "drop_last") %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  ungroup() %>%
  # pivot_wider() creates columns in order of first appearance, so sorting by
  # class first yields the class columns in class order.
  arrange(class, desc(percent)) %>%
  # Keep only the id, name and value columns; a leftover `count` column would
  # act as an extra id and stop rows for the same nationality from collapsing.
  select(nationality2, class, percent) %>%
  pivot_wider(
    names_from = class,
    values_from = percent,
    # A nationality absent from a class has a 0 percent share, not a missing one.
    values_fill = list(percent = 0)
  ) %>%
  # Class columns keep their own labels as headers, so only the id column
  # needs a display name.
  rename(Nationality = nationality2) %>%
  kable(
    caption = "Breakdown of Passenger Nationalities by Class (Wider)",
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling(
    fixed_thead = TRUE,
    full_width = FALSE,
    html_font = "Cambria",
    bootstrap_options = c("striped", "hover")
  )
dat_class_tidy
Breakdown of Passenger Nationalities by Class (Wider)
|
Nationality
|
Count
|
Percent
|
1st Class
|
2nd Class
|
3rd Class
|
|
American
|
195
|
60.19
|
1st Class
|
NA
|
NA
|
|
English
|
38
|
11.73
|
1st Class
|
NA
|
NA
|
|
Canadian
|
27
|
8.33
|
1st Class
|
NA
|
NA
|
|
Other - Multiple
|
14
|
4.32
|
1st Class
|
NA
|
NA
|
|
French
|
10
|
3.09
|
1st Class
|
NA
|
NA
|
|
Swiss
|
6
|
1.85
|
1st Class
|
NA
|
NA
|
|
German
|
5
|
1.54
|
1st Class
|
NA
|
NA
|
|
Irish
|
5
|
1.54
|
1st Class
|
NA
|
NA
|
|
Spanish
|
4
|
1.23
|
1st Class
|
NA
|
NA
|
|
Swedish
|
4
|
1.23
|
1st Class
|
NA
|
NA
|
|
Scottish
|
3
|
0.93
|
1st Class
|
NA
|
NA
|
|
Uruguayan
|
3
|
0.93
|
1st Class
|
NA
|
NA
|
|
Belgian
|
2
|
0.62
|
1st Class
|
NA
|
NA
|
|
Italian
|
2
|
0.62
|
1st Class
|
NA
|
NA
|
|
Channel Islander
|
1
|
0.31
|
1st Class
|
NA
|
NA
|
|
Dutch
|
1
|
0.31
|
1st Class
|
NA
|
NA
|
|
Egyptian
|
1
|
0.31
|
1st Class
|
NA
|
NA
|
|
Mexican
|
1
|
0.31
|
1st Class
|
NA
|
NA
|
|
Norwegian
|
1
|
0.31
|
1st Class
|
NA
|
NA
|
|
Polish
|
1
|
0.31
|
1st Class
|
NA
|
NA
|
|
English
|
145
|
51.06
|
NA
|
2nd Class
|
NA
|
|
Other - Multiple
|
25
|
8.80
|
NA
|
2nd Class
|
NA
|
|
American
|
24
|
8.45
|
NA
|
2nd Class
|
NA
|
|
Channel Islander
|
12
|
4.23
|
NA
|
2nd Class
|
NA
|
|
Irish
|
12
|
4.23
|
NA
|
2nd Class
|
NA
|
|
French
|
11
|
3.87
|
NA
|
2nd Class
|
NA
|
|
Scottish
|
8
|
2.82
|
NA
|
2nd Class
|
NA
|
|
Finnish
|
6
|
2.11
|
NA
|
2nd Class
|
NA
|
|
Swedish
|
6
|
2.11
|
NA
|
2nd Class
|
NA
|
|
Canadian
|
5
|
1.76
|
NA
|
2nd Class
|
NA
|
|
South African
|
4
|
1.41
|
NA
|
2nd Class
|
NA
|
|
Spanish
|
4
|
1.41
|
NA
|
2nd Class
|
NA
|
|
Danish
|
3
|
1.06
|
NA
|
2nd Class
|
NA
|
|
Italian
|
3
|
1.06
|
NA
|
2nd Class
|
NA
|
|
Lithuanian
|
2
|
0.70
|
NA
|
2nd Class
|
NA
|
|
Swiss
|
2
|
0.70
|
NA
|
2nd Class
|
NA
|
|
Syrian/Lebanese
|
2
|
0.70
|
NA
|
2nd Class
|
NA
|
|
Welsh
|
2
|
0.70
|
NA
|
2nd Class
|
NA
|
|
Australian
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
Belgian
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
German
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
Haitian
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
Hungarian
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
Japanese
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
Norwegian
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
Portuguese
|
1
|
0.35
|
NA
|
2nd Class
|
NA
|
|
English
|
112
|
15.84
|
NA
|
NA
|
3rd Class
|
|
Irish
|
105
|
14.85
|
NA
|
NA
|
3rd Class
|
|
Swedish
|
90
|
12.73
|
NA
|
NA
|
3rd Class
|
|
Syrian/Lebanese
|
83
|
11.74
|
NA
|
NA
|
3rd Class
|
|
Other - Multiple
|
69
|
9.76
|
NA
|
NA
|
3rd Class
|
|
Finnish
|
52
|
7.36
|
NA
|
NA
|
3rd Class
|
|
Bulgarian
|
31
|
4.38
|
NA
|
NA
|
3rd Class
|
|
Croatian
|
28
|
3.96
|
NA
|
NA
|
3rd Class
|
|
Norwegian
|
24
|
3.39
|
NA
|
NA
|
3rd Class
|
|
American
|
23
|
3.25
|
NA
|
NA
|
3rd Class
|
|
Belgian
|
22
|
3.11
|
NA
|
NA
|
3rd Class
|
|
Danish
|
7
|
0.99
|
NA
|
NA
|
3rd Class
|
|
Scottish
|
6
|
0.85
|
NA
|
NA
|
3rd Class
|
|
Welsh
|
6
|
0.85
|
NA
|
NA
|
3rd Class
|
|
Canadian
|
5
|
0.71
|
NA
|
NA
|
3rd Class
|
|
French
|
5
|
0.71
|
NA
|
NA
|
3rd Class
|
|
Polish
|
5
|
0.71
|
NA
|
NA
|
3rd Class
|
|
Swiss
|
5
|
0.71
|
NA
|
NA
|
3rd Class
|
|
Bosnian
|
4
|
0.57
|
NA
|
NA
|
3rd Class
|
|
Hong Kongese
|
4
|
0.57
|
NA
|
NA
|
3rd Class
|
|
Italian
|
4
|
0.57
|
NA
|
NA
|
3rd Class
|
|
Greek
|
3
|
0.42
|
NA
|
NA
|
3rd Class
|
|
Channel Islander
|
2
|
0.28
|
NA
|
NA
|
3rd Class
|
|
Chinese
|
2
|
0.28
|
NA
|
NA
|
3rd Class
|
|
German
|
2
|
0.28
|
NA
|
NA
|
3rd Class
|
|
Slovenian
|
2
|
0.28
|
NA
|
NA
|
3rd Class
|
|
Australian
|
1
|
0.14
|
NA
|
NA
|
3rd Class
|
|
Austrian
|
1
|
0.14
|
NA
|
NA
|
3rd Class
|
|
Latvian
|
1
|
0.14
|
NA
|
NA
|
3rd Class
|
|
Lithuanian
|
1
|
0.14
|
NA
|
NA
|
3rd Class
|
|
Portuguese
|
1
|
0.14
|
NA
|
NA
|
3rd Class
|
|
Turkish
|
1
|
0.14
|
NA
|
NA
|
3rd Class
|
# PEER Review MV: Here, I think, is another good opportunity to visualize the tables that cross nationality and class. A plot makes it easier to see the variation in nationality by class. It is interesting how Americans were concentrated in first class, while third class varied much more.
# dat %>%
# filter(category == "Passenger") %>%
# filter(!is.na(nationality2)) %>%
# group_by(class, nationality_cat) %>%
# summarize(count = n()) %>%
# mutate(percent = (count/sum(count))*100) %>%
# arrange(class, desc(percent)) %>%
# ggplot(aes(y = nationality_cat)) +
# geom_col(aes(x = percent), fill = "dark red") +
# facet_wrap(~fct_infreq(class)) +
# geom_text(aes(x = percent, label=round(percent,2)), hjust = -.1, size = 3) +
# theme_minimal() +
# labs(x = "Percent of passengers by class", y = "Nationality")
# Average age by class: mean, standard deviation, minimum and maximum age
# within each class.
dat %>%
  group_by(class) %>%
  # na.rm = TRUE so a single missing age does not turn a whole class's
  # statistics into NA.
  summarize(
    avg_age = mean(age, na.rm = TRUE),
    std_age = sd(age, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE)
  ) %>%
  kable(
    caption = "Average Age by Class",
    col.names = c("Class", "Average Age", "SD Age", "Minimum Age", "Maximum Age"),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kable_styling()
Average Age by Class
|
Class
|
Average Age
|
SD Age
|
Minimum Age
|
Maximum Age
|
|
1st Class
|
39.14
|
13.55
|
0
|
71
|
|
2nd Class
|
30.01
|
13.90
|
0
|
71
|
|
3rd Class
|
25.12
|
11.71
|
0
|
74
|